using <- function(...) {
libs <- unlist(list(...))
need <- libs <- libs[!unlist(lapply(libs, require, character.only = T))]
if(length(need) > 0){
install.packages(need, repos = "https://cloud.r-project.org")
need <- need[!unlist(lapply(need, require, character.only = T))]
if (length(need) > 0) {
if (!requireNamespace("BiocManager", quietly = T))
install.packages("BiocManager")
BiocManager::install(need)
}
}
lapply(libs, require, character.only = T)
}
using("data.table", "tidyverse", "lattice", "gridExtra",
"rtracklayer", "DiffBind", "idr2d", "patchwork")We read in MACS peak calls and select coordinate columns as well as the p-value column, then run IDR
idr <- c('KO','KO2') %>%
paste0('_H3K27ac_peaks.narrowPeak') %>%
lapply(fread, select = c(1:3,8)) %>%
{estimate_idr1d(.[[1]], .[[2]], value_transformation = 'identity')} %>%
.$rep1_df
ggplot(idr, aes(x = rank, y = rep_rank, color = idr)) +
geom_point(size = .1) + scale_color_gradientn(colors = rainbow(10)) +
facet_grid(.~'rank') + theme(legend.position = 'none') +
ggplot(idr, aes(x = value, y = rep_value, color = idr)) +
geom_point(size = .1) + scale_color_gradientn(colors = rainbow(10)) +
facet_grid(.~'value')We’ll take the outputs of different peak callers and compare them
You could take the raw outputs and re-arrange to more a more friendly form
You don’t need to run this chunk in class
pks <- lapply(list.files('peaks', full.names = T), function(x) {
fread(x) %>%
mutate(diff = .[[3]] - .[[2]]) %>%
pull(diff)
}) %>%
setNames(list.files('peaks')) %>%
melt() %>%
mutate(pheno = sub('_.*', '', L1),
mark = sub('^[^_]*_(.*)(_peaks.*|\\.txt)', '\\1', L1),
caller = case_when(
grepl('txt', L1) ~ "SICER",
grepl('narrow', L1) ~ "MACS2 narrow",
grepl('broad', L1) ~ "MACS2 broad",
TRUE ~ "NA"))Which has already been done and can be found in the provided file
load('lec10.RData')We can then perform some basic comparisons
# size of individual intervals
ggplot(pks, aes(x = pheno, y = value, fill = caller)) +
labs(x = "Condition", y = "Peak/domain size") +
geom_violin(position = position_dodge(.8)) +
geom_pointrange(stat = "summary",
fun.min = function(v) quantile(v, .25),
fun.max = function(v) quantile(v, .75),
fun = median, fatten = .8,
position = position_dodge(.8)) +
scale_y_log10() + facet_grid(. ~ mark) +
theme(legend.position = 'bottom')
# total size covered by enriched intervals
pks %>%
group_by(pheno, mark, caller) %>%
summarise(bases = sum(value)) %>%
ggplot(aes(x = pheno, y = bases, fill = caller)) +
labs(x = "Phenotype", y = "Bases in peaks") +
geom_col(position = "dodge") +
scale_y_log10(expand = expansion(c(0,.05))) +
facet_grid(. ~ mark) +
theme(legend.position = 'bottom')We can take the called peaks and merge them to obtain a reference set of genomic intervals – into which we can count reads into just as we’ve done with genes. This can be done easily with DiffBind using a sample sheet containing the locations and BAM / peak files as well as some metadata.
You don’t need to run this chunk in class
dat <- dba(sampleSheet = "samples.csv")
dat <- dba.count(dat)But since the counting process takes a bit of time, we’ll start with the pre-computed object (already loaded from lec10.RData). As with transcriptomic and methylomic matrices, we can again perform routine analyses such as PCA and hierchical clustering
dba.plotPCA(dat, label = DBA_CONDITION)dba.plotHeatmap(dat, correlations = T)dba.plotHeatmap(dat, correlations = F)When we have many regions of interest across the genome, we can assess the distribution of signals surrounding such areas and evaluate the overall trend. One tool for performing this is deepTools, and we can examine the results of their computeMatrix module.
You don’t need to run this chunk in class
library(parallel)
mats <- mclapply(list.files('mats', full.names = T), function(x)
colMeans(fread(x, skip = 1, header = F, drop = 1:6),
na.rm = T), mc.cores = 12) %>%
setNames(list.files('mats'))
mat.ex <- fread('mats/KO_H3K27ac.rx.promoter.mat.gz', skip = 1, header = F)We’ll first take the KO sample’s H3K27ac enrichment surrounding promoters as an example
# use only the matrix part, throw away the coordinates info
dat.ex <- mat.ex[,-(1:6)] %>%
arrange(rowMeans(.)) %>%
as.matrix
# plot
levelplot(t(dat.ex), useRaster = T, xlab = "Position", ylab = "Interval",
scales = list(draw = F), col.regions = clrs,
at = seq(-2, 3, length.out = 100), aspect = "fill")Given the link between promoter acetylation and the cognate gene’s expression, we further incorporate RNA-seq data. Again, for the sake of time, it’s been imported and provided in lec10.RData
You don’t need to run this chunk in class
exps <- fread('BT245-KO-C4.UCSC.featureCounts.txt.counts')
library(biomaRt)
ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL",
host = "grch37.ensembl.org",
path = "/biomart/martservice",
dataset = "hsapiens_gene_ensembl")
res <- getBM(attributes = c("external_gene_name", "chromosome_name",
"start_position", "end_position"),
filters = "external_gene_name", values = exps$Geneid,
mart = ensembl)
dict <- left_join(res[!duplicated(res$external_gene_name) &
res$external_gene_name %in% exps$Geneid,],
exps[exps$Geneid %in% res$external_gene_name],
by = c("external_gene_name" = "Geneid")) %>%
filter(chromosome_name %in% c(1:22, "X", "Y"))With the expresion data in hand, we now combine the two modalities
# get promoters intervals from matrix
pmts <- makeGRangesFromDataFrame(mat.ex,
seqnames.field = 'V1',
start.field = 'V2',
end.field = 'V3')
# get gene intervals overlapping promoters
dict$chr <- paste0("chr", dict$chromosome_name)
genes <- makeGRangesFromDataFrame(dict,
start.field = "start_position",
end.field = "end_position",
seqnames.field = "chr",
keep.extra.columns = T)
pmts.ol <- pmts[overlapsAny(pmts, genes)]
genes.ol <- genes[overlapsAny(genes, pmts)]
# partition genes into quintiles by expression
ranks <- rank(genes.ol$counts/abs((end(genes.ol) - start(genes.ol))),
ties.method = "first")
genes.ol$cut <- as.numeric(cut(ranks, quantile(ranks, probs = seq(0, 1, 0.2)),
include.lowest = TRUE))
pmts.cut <- data.frame(V4 = sprintf("%s:%d-%d", seqnames(pmts.ol),
start(pmts.ol),end(pmts.ol)),
cut = genes.ol$cut[findOverlaps(pmts.ol, genes.ol,
select = "first")])
# integrate
dat.ex <- left_join(pmts.cut, mat.ex[,-c(1:3, 5, 6)]) %>%
select(-V4) %>%
arrange(cut, rowMeans(select(., -cut))) %>%
split(., .$cut)We can plot the results as heatmaps again
# plot heatmap for each quintile
lapply(1:5, function(i) {
levelplot(t(dat.ex[[i]][,2:601]), useRaster = T,
xlab = NULL, ylab = NULL, scales = list(draw = F),
col.regions = clrs, aspect = "fill",
at = seq(-2, 3, length.out = 100),
colorkey = F, margin = F)
}) %>% grid.arrange(grobs = ., ncol = 1)Or alternatively take the column-wise average across said heatmap as a summary
lapply(dat.ex, function(x) colMeans(x[,2:601])) %>%
melt() %>%
rownames_to_column("row") %>%
mutate(row = (as.numeric(row) - 1) %% (n()/5) + 1) %>%
ggplot(aes(x = row, y = value, color = L1)) +
annotate("rect", xmin = 200, xmax = 400, ymin = -Inf,
ymax = Inf, alpha = 0.2, fill = "grey") +
geom_line() +
labs(y = "Enrichment", title = "H3K27ac in promoters",
color = "Expression level") +
scale_x_continuous(breaks = c(1, 200 * 1:3),
labels = c("-2kb", "Start", "End", "+2kb"),
expand = c(0, 0), name = "Position") +
theme_bw() + theme(legend.position = "bottom")Or more directly we can just assess the relationship per-promoter
ols <- findOverlaps(pmts.ol, genes.ol, select = "first")
genes.ol$len <- abs(end(genes.ol) - start(genes.ol)) / 1000
fac <- sum(genes.ol$counts / genes.ol$len) / 1e6
pmts.cut$tpm <- (genes.ol$counts[ols] / genes.ol$len[ols]) / fac
dat.ex <- left_join(pmts.cut, mat.ex[,-c(1:3, 5, 6)]) %>%
select(-c(V4, cut)) %>%
mutate(rm = rowMeans(select(., 202:401))) %>%
select(tpm, rm)
smoothScatter(x = log2(dat.ex$tpm + 1), y = dat.ex$rm,
xlab = "Expression", ylab = "H3K27ac")We’ve been only looking at one sample thus far, and to contrast difference samples we’ll need to make sure the scales are meaningfully comparable
Taking our previous promoter aggregate plots, different methods would point to rather divergent conclusions
# define groups
ann <- tibble(f = names(mats)) %>%
separate(f, c('samps', 'marks', 'norms', 'regs', NA, NA), '[_\\.]', F)
# what to plot
mark <- 'H3K27ac'
reg <- 'promoter'
# label groups
dats <- melt(mats[ann$f[ann$marks == mark & ann$regs == reg]]) %>%
rownames_to_column("row") %>%
mutate(row = (as.numeric(row) - 1) %% (n()/6) + 1) %>%
merge(dplyr::rename(ann, L1 = f)) %>%
mutate(norms = factor(norms, levels = c('raw', 'inp', 'rx'))) %>%
select(-L1)
# label axis
len <- nrow(dats) / 6 / 3
if (len * 10 < 1000) {
lab.l <- sprintf("-%db", len)
lab.r <- sprintf("+%db", len)
} else {
lab.l <- sprintf("-%dkb", len / 100)
lab.r <- sprintf("+%dkb", len / 100)
}
# plot
ggplot(dats, aes(x = row, y = value, color = samps)) +
annotate("rect", xmin = len, xmax = len * 2, ymin = -Inf,
ymax = Inf, alpha = 0.2, fill = "grey") +
geom_line() +
labs(y = "Enrichment", title = sprintf("%s @ %s", mark, reg)) +
scale_x_continuous(breaks = c(1, len * 1:3),
labels = c(lab.l, "Start", "End", lab.r),
expand = c(0, 0), name = "Position") +
facet_wrap(. ~ norms, scales = "free") + theme_bw()As opposed to limiting ourselves to some set of pre-defined regions, we could also just take the counts in uniformly divided windows across the genome, which is stored cts object (a list of bedGraph files imported like below)
You don’t need to run this chunk in class
cts <- lapply(list.files('10kb', full.names = T), import.bedGraph) %>%
setNames(sub('.10kb.bed', '', list.files('10kb')))We now apply various normalization factors
# read in scaling factors
facs <- fread('https://hgen663.me/rx.csv')| condition | mark | hs_chip | hs_input | dm_chip | dm_input |
|---|---|---|---|---|---|
| KO | H3K9me3 | 56785959 | 34362619 | 1202438 | 1936047 |
| KO | H3K27ac | 25379930 | 34362619 | 3671901 | 1936047 |
| KO | H3K36me2 | 46933048 | 34362619 | 428509 | 1936047 |
| KO | H3K27me3 | 46474170 | 34362619 | 3752386 | 1936047 |
| K27M | H3K9me3 | 60483553 | 30721591 | 1798155 | 1334441 |
| K27M | H3K27ac | 35887148 | 30721591 | 2601095 | 1334441 |
| K27M | H3K36me2 | 65841940 | 30721591 | 785877 | 1334441 |
| K27M | H3K27me3 | 35038431 | 30721591 | 14848754 | 1334441 |
facs <- facs %>%
mutate(sample = paste(facs$condition, facs$mark, sep = "_")) %>%
rowwise() %>%
mutate(ref = min(hs_chip, hs_input)) %>%
ungroup() %>%
mutate(depthfac1 = ref / hs_chip,
depthfac2 = ref / hs_input,
rx = (hs_chip / hs_input) / (dm_chip / dm_input),
scalefac1 = rx * depthfac1)| condition | mark | hs_chip | hs_input | dm_chip | dm_input | sample | ref | depthfac1 | depthfac2 | rx | scalefac1 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| KO | H3K9me3 | 56785959 | 34362619 | 1202438 | 1936047 | KO_H3K9me3 | 34362619 | 0.6051253 | 1.0000000 | 2.6607735 | 1.6101013 |
| KO | H3K27ac | 25379930 | 34362619 | 3671901 | 1936047 | KO_H3K27ac | 25379930 | 1.0000000 | 0.7385913 | 0.3894297 | 0.3894297 |
| KO | H3K36me2 | 46933048 | 34362619 | 428509 | 1936047 | KO_H3K36me2 | 34362619 | 0.7321625 | 1.0000000 | 6.1708992 | 4.5181011 |
| KO | H3K27me3 | 46474170 | 34362619 | 3752386 | 1936047 | KO_H3K27me3 | 34362619 | 0.7393918 | 1.0000000 | 0.6978045 | 0.5159509 |
| K27M | H3K9me3 | 60483553 | 30721591 | 1798155 | 1334441 | K27M_H3K9me3 | 30721591 | 0.5079330 | 1.0000000 | 1.4610526 | 0.7421168 |
| K27M | H3K27ac | 35887148 | 30721591 | 2601095 | 1334441 | K27M_H3K27ac | 30721591 | 0.8560611 | 1.0000000 | 0.5992919 | 0.5130305 |
| K27M | H3K36me2 | 65841940 | 30721591 | 785877 | 1334441 | K27M_H3K36me2 | 30721591 | 0.4665961 | 1.0000000 | 3.6391815 | 1.6980278 |
| K27M | H3K27me3 | 35038431 | 30721591 | 14848754 | 1334441 | K27M_H3K27me3 | 30721591 | 0.8767970 | 1.0000000 | 0.1024968 | 0.0898689 |
# normalize
for (samp in facs$sample) {
# get ratios
rx <- facs$scalefac1[facs$sample == samp]
r1 <- facs$depthfac1[facs$sample == samp]
r2 <- facs$depthfac2[facs$sample == samp]
# scale by factor
num.input <- r1 * cts[[samp]]$score + 1
num.rx <- rx * cts[[samp]]$score + 1
denom <- r2 * cts[[sub('_.*', '_input', samp)]]$score + 1
cts[[samp]]$rx <- log2(num.rx / denom)
cts[[samp]]$input <- log2(num.input / denom)
}And take H3K27me3 as an example
# compare counts
mark <- 'H3K27me3'
samp1 <- paste('KO', mark, sep = '_')
samp2 <- paste('K27M', mark, sep = '_')
smoothScatter(x = log2(cts[[samp1]]$score + 1),
y = log2(cts[[samp2]]$score + 1),
xlab = samp1, ylab = samp2, main = "Raw")
abline(0,1)Or as MA plots
ma <- list()
ma[["raw"]] <- data.frame(m = log(cts[[samp1]]$score) - log(cts[[samp2]]$score),
a = 0.5 * (log(cts[[samp1]]$score) + log(cts[[samp2]]$score)))
ma[["input"]] <- data.frame(m = cts[[samp1]]$input - cts[[samp2]]$input,
a = 0.5 * (cts[[samp1]]$input + cts[[samp2]]$input))
ma[["rx"]] <- data.frame(m = cts[[samp1]]$rx - cts[[samp2]]$rx,
a = 0.5 * (cts[[samp1]]$rx + cts[[samp2]]$rx))
par(mfrow = c(1,3))
for(nm in names(ma)) {
smoothScatter(x = ma[[nm]]$a, y = ma[[nm]]$m, ylab = "M", xlab = "A",
main = sprintf("%s - %s (%s)", samp1, samp2, nm))
abline(h = 0)
}